# coding=utf-8

from bs4 import BeautifulSoup
import urllib2
import time
import sys
# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
# reload(sys) has to run first; presumably sys.setdefaultencoding is not
# reachable on a freshly started interpreter (TODO confirm against the
# Python 2 `site` module documentation).
# NOTE(review): presumably here so that implicit unicode -> str conversions
# (e.g. writing the scraped text to the output file) do not fail on
# non-ASCII text -- confirm before removing.
reload(sys)
sys.setdefaultencoding("utf-8")

# Alternate novel index page ("Wild Rose"), kept for reference:
# url = 'http://www.xiangcunxiaoshuo.com/modules/article/reader.php?aid=409528'
# Index page of the novel to download ("The Great Ruler").
url = 'http://www.xiangcunxiaoshuo.com/html/67/'
# Browser-like User-Agent header sent with every request this script makes.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

request = urllib2.Request(url=url, headers=headers)
# Bound the wait so a stalled server cannot hang the script forever, and
# release the connection explicitly instead of leaving it to garbage collection.
response = urllib2.urlopen(request, timeout=30)
try:
    html = response.read()
finally:
    response.close()

soup = BeautifulSoup(html, 'html.parser')

# Every <dd> element of the index page; the download loop further down only
# processes the ones that contain a link.
articleList = soup.find_all('dd')

# Novel title, read from the <h1> inside <section class="ml_title">; it is
# used as the base name of the output text file.
title = soup.find('section', {'class': 'ml_title'}).h1.text


def retrieve_article(url, timeout=30):
    """Download the page at *url* and return it parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` dictionary.

    :param url: absolute URL of the article page to fetch.
    :param timeout: seconds to wait on the network before giving up, so a
        stalled server cannot block the script indefinitely.
    :return: ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Errors raised by ``urllib2`` (including timeouts) are not caught here and
    propagate to the caller.
    """
    article_request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(article_request, timeout=timeout)
    try:
        article_html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    return BeautifulSoup(article_html, 'html.parser')


def parse_article(article_soup):
    """Return the chapter body of an article page as prettified markup text.

    Looks up the ``<div class="yd_text2">`` element in *article_soup*,
    re-parses its prettified markup into a separate tree (so *article_soup*
    itself is not modified), removes every ``<br>`` element from that copy and
    returns the copy's prettified markup with the ``<div class="yd_text2">``
    opening tag and all ``</div>`` closing tags stripped out. Any other tags
    inside the div remain in the returned text.

    NOTE(review): there is no handling for a page that lacks that div.
    """
    body_div = article_soup.find('div', {'class': 'yd_text2'})
    body_copy = BeautifulSoup(body_div.prettify(), 'html.parser')

    # Detach each <br> from the copied tree.
    for line_break in body_copy.find_all('br'):
        line_break.extract()

    markup = body_copy.prettify()
    markup = markup.replace('<div class="yd_text2">', '')
    return markup.replace('</div>', '')


def write_to_file(target_file, article_title, article_content):
    """Write one article to an already opened, writable file object.

    The output is: a newline, the title, a newline, the content, and finally
    two newlines. Each of the three pieces is passed to ``target_file.write``
    separately, in that order.

    :param target_file: object exposing a ``write`` method.
    :param article_title: title text placed on its own line.
    :param article_content: body text written right after the title line.
    """
    heading = '\n' + article_title + '\n'
    for piece in (heading, article_content, '\n\n'):
        target_file.write(piece)

# Walk the chapter list: download every linked chapter and append it to
# "<title>.txt".
output_path = title + ".txt"  # same file for every chapter, so compute it once
for article in articleList:
    link = article.a
    if not link:
        continue  # this <dd> has no link, so there is nothing to download
    # Wait 3 seconds before each chapter request (presumably to avoid
    # hammering the server).
    time.sleep(3)
    chapter_title = link.string
    # Single-argument parenthesised form prints the same as the bare
    # Python 2 print statement.
    print(chapter_title)
    article_soup = retrieve_article('http://www.xiangcunxiaoshuo.com' + link.get('href'))
    content = parse_article(article_soup)
    # Opened per chapter in append mode; `with` guarantees the handle is
    # closed even when writing raises.
    with open(output_path, 'a') as target_file:
        write_to_file(target_file, chapter_title, content)
